Linear regression with theoretical confidence intervals

Synthetic Experiement

Let’s explore how things work when the dependency between two variables is accurately characterized with the regression line.

set.seed(1)
n <- 100

E <- rnorm(n, mean=0, sd=1)
X <- rnorm(n, mean=0, sd=0.25)

Y <- 920 + 10*X + E # simple linear regression

simulated_data <- data.frame(Y=Y, X=X) # observed data

(out.lm <- lm(Y ~ X, data=simulated_data)) # add restrictions/assumptions
## 
## Call:
## lm(formula = Y ~ X, data = simulated_data)
## 
## Coefficients:
## (Intercept)            X  
##     920.109        9.996
ggplot(simulated_data, aes(x=X, y=Y)) + geom_point() + geom_smooth(formula = y~x, method=lm, se=FALSE)

Theoretical Uncertainty

ggplot(simulated_data, aes(x=X, y=Y)) + geom_point() + geom_smooth(formula = y~x, method=lm, se=TRUE)

  • Replicate this with 1000 times.
n <- 100 # sample size
n.reps <- 1000 # num of replications

theoretial_slope <- set_names(1:n.reps) %>% 
  map(~ tibble(X=rnorm(n, mean=0, sd=0.25), E=rnorm(n, mean=0, sd=1)) %>% mutate(Y = 920 + 10*X + E)  %>% summarize(slope=lm(Y~X)$coef[2]) %>% as.numeric()) %>% unlist()

my_kde_plot <- ggplot(data=data.frame(theoretial_slope=theoretial_slope), aes(x=theoretial_slope)) + geom_density(fill='gray', adjust=1.5)

confidence_interval <- ggplot_build(my_kde_plot)$data[[1]] %>% 
  filter(x > quantile(theoretial_slope, 0.025)) %>%
  filter(x < quantile(theoretial_slope, 0.975)) %>%
  # https://ggplot2.tidyverse.org/reference/geom_ribbon.html
  geom_area(mapping=aes(x=x,y=y, color='95% Confidence Interval'), 
            fill='red', alpha=0.25) 
  
my_kde_plot + confidence_interval + theme(legend.position="bottom") +
  ggtitle(paste0('If true slope was 10 and n=', n, sep=''))

  • This is how confidence intervals are made!
    • we did this with a large simulation approximation; but,
    • you can also “work out the math”

Confidence intervals/envelopes

  • regression coefficients
out.lm <- lm(income ~ age, data = incex)
confint(out.lm)
##                  2.5 %    97.5 %
## (Intercept) 894.114059 967.17031
## age           8.334461  10.01613
  • 95% confidence interval of predicted means

    • With a confidence of 95% the interval [1192; 1220] (in EUR) covers the true mean income for the population of 30 year old employees.
predict(out.lm, data.frame(age = 30), interval = 'confidence')
##        fit      lwr     upr
## 1 1205.901 1191.682 1220.12
  • 95% confidence envelope for a linear line
age.pre <- 18:65
inc.pre <- predict(out.lm, data.frame(age = age.pre), interval = 'confidence')
kable(head(inc.pre))
fit lwr upr
1095.798 1073.385 1118.210
1104.973 1083.304 1126.641
1114.148 1093.216 1135.080
1123.323 1103.120 1143.527
1132.499 1113.014 1151.983
1141.674 1122.899 1160.449
  • 95% confidence envelope plot
par(mar=c(4, 4, 0.5, 2)) # bottom, left, top, right
plot(income ~ age, data = incex, cex = .4)
abline(out.lm, col = 'blue', lwd = 2) # add linear reg.
lines(age.pre, inc.pre[, 2], col = 'red') # add lower limit
lines(age.pre, inc.pre[, 3], col = 'red') # add upper limit

ggplot2

  • use geom_smooth(..., se=TRUE, level=0.95) (or stat_smooth(..., geom = "smooth", se=TRUE, level=0.95))
ggplot(incex, aes(x=oexp, y=income)) + geom_point(alpha=0.2) + labs(x = 'Occupational Experience', y = 'Monthly Net Income') + 
        geom_smooth(method='lm', formula= y~x, col="red", 
                    se = TRUE, level = 0.95, fill = "pink") + theme_bw()

  • Draw confidence envelopes with a subset of the first 100 observations.

  • For linear regression, confidence intervals are affected by sample size \(N\), the deviation from the sample mean of X (i.e., \(X-\bar{X}\)), variance of \(X\), and the sample residual variance (variance error of the estimate/regression).

incex2 <- incex[1:100,]
ggplot(incex2, aes(x=oexp, y=income)) + geom_point(alpha=0.2) + labs(x = 'Occupational Experience', y = 'Monthly Net Income') + theme_bw() +
        geom_smooth(method='lm', formula= y~x, col="red", 
                    se = TRUE, level = 0.95, fill = "pink")

  • Add lower CIs and upper CIs manually using geom_ribbon
out.lm2 <- lm(income ~ oexp, incex2)
inc.pre2 <- data.frame(predict(out.lm2, interval = 'confidence'))
incex3 <- cbind(incex2, inc.pre2) # combine two datasets
kable(head(incex3))
sex age edu occ oexp income fit lwr upr
female 62 low low 35 953 1380.246 1307.878 1452.614
male 32 high high 6 1224 1203.481 1133.052 1273.909
male 56 med. high 36 1466 1386.341 1311.134 1461.549
female 63 med. med. 38 1339 1398.532 1317.440 1479.624
male 20 low low 3 1184 1185.195 1106.191 1264.198
female 38 med. med. 12 1196 1240.053 1184.069 1296.037
ggplot(incex3, aes(x=oexp, y=income)) + geom_point(alpha=0.2) + labs(x = 'Occupational Experience', y = 'Monthly Net Income') + theme_bw() + 
        geom_smooth(method='lm', formula= y~x, se = FALSE) +
        geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.3, fill="#69b3a2")

  • Add CIs and the fitted line manually using geom_ribbon and geom_line
ggplot(incex3, aes(x=oexp, y=income)) + geom_point(alpha=0.2) + labs(x = 'Occupational Experience', y = 'Monthly Net Income') + theme_bw() + 
        geom_line(aes(y=fit), col="red") +
        geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.3, fill="#69b3a2")

  • Two regression lines
ggplot(incex, aes(x=oexp, y=income, color=sex, fill=sex)) + geom_point(alpha=0.2) + labs(x = 'Occupational Experience', y = 'Monthly Net Income') + 
        geom_smooth(method='lm', formula= y~x, se = TRUE) + theme_bw()

ggplot(incex, aes(x=oexp, y=income, color=sex, fill=sex)) + geom_point(alpha=0.2) + labs(x = 'Occupational Experience', y = 'Monthly Net Income') + 
        geom_smooth(method='lm', formula= y~x, se = TRUE) + 
        facet_grid(~ sex) + theme_bw()

Linear regression with bootstrap confidence intervals

Bootstrap Uncertainty

n.reps <- 1000 # num of replicates
bootstraps <- set_names(1:n.reps) %>% 
  map(~ geom_line(data=sample_n(simulated_data, size=n, replace=TRUE),
                  mapping=aes(x=X, y=Y), 
                  stat="smooth", 
                  formula='y~x', method='lm', se=FALSE, alpha=0.5))

bootstraps[[1]] <- ggplot(simulated_data, aes(x=X, y=Y)) + geom_point() + bootstraps[[1]]

Reduce("+", bootstraps)

  • We treat the “sample” as the “population” and “(re)-sample” from that “population” over and over.
  • This is called “bootstrapping”: i.e., “we use what we have”
bootstrap_slope <- set_names(1:n.reps) %>% 
  map(~ sample_n(simulated_data, size=n, replace=TRUE) %>%  
  summarize(slope=lm(Y~X)$coef[2]) %>% as.numeric()) %>% unlist()

my_histogram_plot <- ggplot(data=data.frame(bootstrap_slope=bootstrap_slope), aes(x=bootstrap_slope)) + geom_histogram(bins=20, fill='gray')

bootstrap_interval <- ggplot_build(my_histogram_plot)$data[[1]] %>%
  filter(xmin > quantile(bootstrap_slope, 0.025)) %>%
  filter(xmax < quantile(bootstrap_slope, 0.975)) %>%
  geom_col(mapping=aes(y=y, x=(xmin+xmax)/2, 
                       color='95% bootstrapped\nconfidence interval'), 
           fill='red', alpha=0.25)

my_histogram_plot + bootstrap_interval + theme(legend.position="bottom")

  • We create bootstrap samples, and calculate our “statistic” for each bootstrap sample
  • Then we take the “middle 95%” and that’s a 95% bootstrap confidence interval

The boot package

There is a package boot with a function boot() that does the bootstrap for many situations. The function boot() requires three arguments: (1) the data from the original sample (here, incex); (2) a function to compute the statistics from the data where the first argument is the data and the second argument is the indices of the observations in the bootstrap sample; (3) the number of bootstrap replicates.

library(boot)
b.stat <- function(data, i)
{
   b.dat <- data[i ,]
   out.lm <- lm(income ~ oexp, b.dat)
   predict(out.lm, data.frame(oexp=incex2$oexp))   
}

incex2 <- incex[1:100,] # subset of the first 100 cases
b.out <- boot(incex2, b.stat, R = 1000) # R = num of replications

boot.ci(b.out, index = 1, type = 'perc') # 95% CI for the first observation
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = b.out, type = "perc", index = 1)
## 
## Intervals : 
## Level     Percentile     
## 95%   (1311, 1456 )  
## Calculations and Intervals on Original Scale
b.ci <- t(sapply(1:nrow(incex2), function(x) boot.ci(b.out, index = x, type = 'perc')$percent))[, 4:5]
dimnames(b.ci) <- list(rownames(incex2), c('lower', 'upper'))
kable(head(b.ci, 4))
lower upper
1311.324 1456.113
1144.457 1268.871
1315.804 1464.962
1323.975 1481.654

Plot with bootstrap confidence intervals

incex4 <- cbind(incex2, b.ci) # combine two datasets
ggplot(incex4, aes(x=oexp, y=income)) + geom_point(alpha=0.2) + labs(x = 'Occupational Experience', y = 'Monthly Net Income') + theme_bw() + 
        geom_smooth(method='lm', formula= y~x, se = FALSE) +
        geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.3, fill="#69b3a2")

Loess with confidence intervals

  • Scatterplot with loess regression
# fit loess
out.lss <- loess(income ~ oexp, data = incex, span = .2, degree = 1) # local LINEAR regression, degree = 1

# get predicted means
x.val <- seq(0, 48, by = .5)
y.pred <- predict(out.lss, data.frame(oexp = x.val))

plot(income ~ oexp, data = incex, cex = .4, xlab = 'Occup. Experience', 
     ylab = 'Monthly Net Income', main = 'loess (LOcal regrESSion)')

lines(x.val, y.pred, col = 'blue', lwd = 2)

  • We can add confidence intervals/envelopes for non-parametric, loess regression.

  • Get standard errors for predicted means and construct confidence intervals.

loess.pred <- predict(out.lss, x.val, se=TRUE)
str(loess.pred)
## List of 4
##  $ fit           : num [1:97] 1195 1187 1179 1171 1164 ...
##  $ se.fit        : num [1:97] 18.8 16.2 14.4 13.7 13.9 ...
##  $ residual.scale: num 234
##  $ df            : num 1911
plot(income ~ oexp, data = incex, cex = .4, xlab = 'Occup. Experience', 
     ylab = 'Monthly Net Income', main = 'loess (LOcal regrESSion)')
lines(x.val, y.pred, col = 'blue', lwd = 2)
lines(x.val, y.pred - qt(0.975, loess.pred$df)*loess.pred$se, lty=2, col="red")
lines(x.val, y.pred + qt(0.975, loess.pred$df)*loess.pred$se, lty=2, col="red")

  • Use ggplot.
ggplot(incex, aes(x=oexp, y=income)) + geom_point(alpha=0.2) + labs(x = 'Occupational Experience', y = 'Monthly Net Income') + 
        geom_smooth(method='loess', formula= y~x, col="red", 
                    se = TRUE, level = 0.95, fill = "#69b3a2", span=.2) + theme_bw()

Binary outcome: logistic regression

  • We now want to use binary outcome instead of continuous outcome. Let’s use SDS data (SCS_QE.sav).
scs$voc <- ifelse(scs$vm == 'Vocabulary', 1, 0) # factor to numeric

kable(head(scs[, c("numbmath", "vm", "voc")]))
numbmath vm voc
2 Mathematics 0
2 Mathematics 0
1 Mathematics 0
2 Vocabulary 1
2 Vocabulary 1
2 Vocabulary 1
  • You may want to jitter data points to avoid overlapping points.
par(mar=c(4, 4, 0.5, 2)) # bottom, left, top, right
plot(voc ~ jitter(numbmath), data = scs, xlim = c(0, 15), ylim = c(-.1, 1), 
   xlab = 'Number of math courses', ylab = 'Vocabulary training')

# fit models
out.lm <- lm(voc ~ numbmath, data = scs) # linear reg.
out.smth <- loess(voc ~ numbmath, scs) # loess reg
out.glm <- glm(vm ~ numbmath, family = 'binomial', data = scs) # logistic reg.
  • Add regression lines.
par(mar=c(4, 4, 0.5, 2)) # bottom, left, top, right
plot(voc ~ jitter(numbmath), data = scs, xlim = c(0, 15), ylim = c(-.1, 1), 
   xlab = 'Number of math courses', ylab = 'Vocabulary training')

abline(out.lm, lwd = 2) # add linear reg
lines(seq(0, 10, .1), predict(out.smth, data.frame(numbmath = seq(0, 10, .1))), col="blue", lwd = 2) # add loess
lines(0:15, predict(out.glm, data.frame(numbmath = 0:15), type = 'response'),
  lwd = 2, col = 'red', lty = 2) # add logistic reg.
legend('topright', c('Linear-probability model', 'loess', 'Logit model (logistic regression)'),
   col = c('black', 'blue', 'red'), lty = c(1, 1, 2), lwd = 2, bty = 'n')

  • Use ggplot.

    • To fit logistic regression in package ggplot2, you need to use a numeric outcome vector lying between 0 and 1.

    • Try with factor vm instead of numeric voc.

ggplot(scs, aes(x=numbmath, y=voc)) +
  geom_jitter(height = 0.05) +
  geom_smooth(method = "glm", method.args = list(family = "binomial")) + theme_bw()

  • You can fit a more flexible model. You can exercise more control and see whether it’s a good model or not.
ggplot(scs, aes(x=numbmath, y=voc)) +
  geom_jitter(height = 0.05) +
  geom_smooth(method = "glm", formula = y ~ splines::ns(x, 2), method.args = list(family = "binomial")) + theme_bw()